library(knitr)
# install.packages("readr")
library(readr)
library(corrplot)
## corrplot 0.92 loaded
# Karolina:
# setwd("C:/Users/karla/OneDrive/Pulpit/Credit Risk/Projekt")
# app_train3<-read_csv("app_train3.csv")
# app_test3<-read_csv("app_test3.csv")
# Zuzia:
setwd("/Users/zuzanna/Desktop/Studia/CreditRisk/projekt")
load("dane/app_train_prep.RData")
load("dane/app_test_prep.RData")
This is an overview of our initially prepared data. We extracted the applicant’s profiles from Kaggle.
The summary of Train Data.
summary(app_train3)
## SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER
## Min. :100002 Min. :0.00000 Length:307511 Length:307511
## 1st Qu.:189146 1st Qu.:0.00000 Class :character Class :character
## Median :278202 Median :0.00000 Mode :character Mode :character
## Mean :278180 Mean :0.08073
## 3rd Qu.:367142 3rd Qu.:0.00000
## Max. :456255 Max. :1.00000
##
## FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL
## Length:307511 Length:307511 Min. : 0.0000 Min. : 25650
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 112500
## Mode :character Mode :character Median : 0.0000 Median : 147150
## Mean : 0.4171 Mean : 168798
## 3rd Qu.: 1.0000 3rd Qu.: 202500
## Max. :19.0000 Max. :117000000
##
## AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE
## Min. : 45000 Min. : 1616 Min. : 40500 Length:307511
## 1st Qu.: 270000 1st Qu.: 16524 1st Qu.: 238500 Class :character
## Median : 513531 Median : 24903 Median : 450000 Mode :character
## Mean : 599026 Mean : 27109 Mean : 538396
## 3rd Qu.: 808650 3rd Qu.: 34596 3rd Qu.: 679500
## Max. :4050000 Max. :258026 Max. :4050000
## NA's :12 NA's :278
## NAME_EDUCATION_TYPE NAME_FAMILY_STATUS DAYS_EMPLOYED REGION_RATING_CLIENT
## Length:307511 Length:307511 Min. :-17912 Min. :1.000
## Class :character Class :character 1st Qu.: -2760 1st Qu.:2.000
## Mode :character Mode :character Median : -1213 Median :2.000
## Mean : 63815 Mean :2.052
## 3rd Qu.: -289 3rd Qu.:2.000
## Max. :365243 Max. :3.000
##
## DOCUMENT EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3
## Min. :0.00000 Min. :0.01 Min. :0.0000 Min. :0.00
## 1st Qu.:0.05000 1st Qu.:0.33 1st Qu.:0.3925 1st Qu.:0.37
## Median :0.05000 Median :0.51 Median :0.5660 Median :0.54
## Mean :0.04651 Mean :0.50 Mean :0.5144 Mean :0.51
## 3rd Qu.:0.05000 3rd Qu.:0.68 3rd Qu.:0.6636 3rd Qu.:0.67
## Max. :0.20000 Max. :0.96 Max. :0.8550 Max. :0.90
## NA's :173378 NA's :660 NA's :60965
## AMT_REQ_CREDIT_BUREAU_QRT
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 0.27
## 3rd Qu.: 0.00
## Max. :261.00
## NA's :41519
The summary of Test Data.
summary(app_test3)
## SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
## Min. :100001 Length:48744 Length:48744 Length:48744
## 1st Qu.:188558 Class :character Class :character Class :character
## Median :277549 Mode :character Mode :character Mode :character
## Mean :277797
## 3rd Qu.:367556
## Max. :456250
##
## FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
## Length:48744 Min. : 0.0000 Min. : 26942 Min. : 45000
## Class :character 1st Qu.: 0.0000 1st Qu.: 112500 1st Qu.: 260640
## Mode :character Median : 0.0000 Median : 157500 Median : 450000
## Mean : 0.3971 Mean : 178432 Mean : 516740
## 3rd Qu.: 1.0000 3rd Qu.: 225000 3rd Qu.: 675000
## Max. :20.0000 Max. :4410000 Max. :2245500
##
## AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_EDUCATION_TYPE
## Min. : 2295 Min. : 45000 Length:48744 Length:48744
## 1st Qu.: 17973 1st Qu.: 225000 Class :character Class :character
## Median : 26199 Median : 396000 Mode :character Mode :character
## Mean : 29426 Mean : 462619
## 3rd Qu.: 37390 3rd Qu.: 630000
## Max. :180576 Max. :2245500
## NA's :24
## NAME_FAMILY_STATUS DAYS_EMPLOYED REGION_RATING_CLIENT DOCUMENT
## Length:48744 Min. :-17463 Min. :1.000 Min. :0.00000
## Class :character 1st Qu.: -2910 1st Qu.:2.000 1st Qu.:0.05000
## Mode :character Median : -1293 Median :2.000 Median :0.05000
## Mean : 67485 Mean :2.038 Mean :0.04923
## 3rd Qu.: -296 3rd Qu.:2.000 3rd Qu.:0.05000
## Max. :365243 Max. :3.000 Max. :0.05000
##
## EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 AMT_REQ_CREDIT_BUREAU_QRT
## Min. :0.013 Min. :0.000008 Min. :0.001 Min. :0.000
## 1st Qu.:0.344 1st Qu.:0.408066 1st Qu.:0.364 1st Qu.:0.000
## Median :0.507 Median :0.558758 Median :0.519 Median :0.000
## Mean :0.501 Mean :0.518021 Mean :0.500 Mean :0.547
## 3rd Qu.:0.666 3rd Qu.:0.658497 3rd Qu.:0.653 3rd Qu.:1.000
## Max. :0.939 Max. :0.855000 Max. :0.883 Max. :7.000
## NA's :20532 NA's :8 NA's :8668 NA's :6049
unique(app_train3$TARGET)
## [1] 1 0
app_train3$TARGET <- as.numeric(app_train3$TARGET)
summary(app_train3$TARGET)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08073 0.00000 1.00000
# Class balance of the TARGET column in the train sample.
targetTR <- table(app_train3$TARGET)
barplot(targetTR,
  main = "Target variable in train sample",
  ylab = "Count",
  # Take the bar labels from the table itself so they always follow the order
  # in which table() sorted the classes.
  names.arg = names(targetTR),
  col = c("grey", "orange")
)
We can observe the contract type is a text variable. We need to
transform it to categorical data type.
unique(app_train3$NAME_CONTRACT_TYPE)
## [1] "Cash loans" "Revolving loans"
unique(app_test3$NAME_CONTRACT_TYPE)
## [1] "Cash loans" "Revolving loans"
After transformation:
Cash loans = 0 and Revolving loans = 1
## [1] 0 1
## [1] TRUE
## [1] TRUE
##
## 0 1
## 278232 29279
##
## 0 1
## 48305 439
We can observe the gender is a text variable. We need to transform it
to binary data type.
unique(app_train3$CODE_GENDER)
## [1] "M" "F" "XNA"
unique(app_test3$CODE_GENDER)
## [1] "F" "M"
After transformation:
## [1] 0 1
## [1] 1 0
## [1] TRUE
## [1] TRUE
We can observe that car ownership is a text variable. We need to
transform it to binary data type.
unique(app_train3$FLAG_OWN_CAR)
## [1] "N" "Y"
unique(app_test3$FLAG_OWN_CAR)
## [1] "N" "Y"
After transformation:
## [1] 0 1
## [1] 0 1
## [1] TRUE
## [1] TRUE
We can observe that ownership of a flat or house is a text variable. We need
to transform it to binary data type.
unique(app_train3$FLAG_OWN_REALTY)
## [1] "Y" "N"
unique(app_test3$FLAG_OWN_REALTY)
## [1] "Y" "N"
After transformation:
## [1] 1 0
## [1] 1 0
## [1] TRUE
## [1] TRUE
unique(app_train3$CNT_CHILDREN)
## [1] 0 1 2 3 4 7 5 6 8 9 11 12 10 19 14
unique(app_test3$CNT_CHILDREN)
## [1] 0 2 1 3 8 4 6 5 7 20 11
In this case, we will treat families with 4 or more children as large families. We need to cap the data.
Transform the variable to binary.
At this stage, our children variable is balanced for the train sample and imbalanced for the test sample.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26942 112500 157500 178432 225000 4410000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25650 112500 147150 168798 202500 117000000
We can observe that the outliers are only on the right side.
We use winsorization.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45000 260640 450000 516740 675000 2245500
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45000 270000 513531 599026 808650 4050000
Applying logarithmic transformation.
# Put AMT_CREDIT on a natural-log scale in both samples.
app_train3$AMT_CREDIT <- log(app_train3$AMT_CREDIT)
app_test3$AMT_CREDIT <- log(app_test3$AMT_CREDIT)
# Histograms of the transformed credit amount, test sample first.
hist(app_test3$AMT_CREDIT,
  main = "Distribution of CA in test sample after logarithmic transformation",
  xlab = "Credit Total"
)
hist(app_train3$AMT_CREDIT,
  main = "Distribution of CA in train sample after logarithmic transformation",
  xlab = "Credit Total"
)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2295 17973 26199 29426 37390 180576 24
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1616 16524 24903 27109 34596 258026 12
We need to handle the NA’s (by imputation) and adjust the distribution by
applying a logarithmic transformation.
# Summary statistics of AMT_GOODS_PRICE
summary(app_test3$AMT_GOODS_PRICE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45000 225000 396000 462619 630000 2245500
summary(app_train3$AMT_GOODS_PRICE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 40500 238500 450000 538396 679500 4050000 278
# Impute missing AMT_GOODS_PRICE values by resampling: every NA is replaced by
# a value drawn with replacement from the observed values of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
no_missing_train <- app_train3$AMT_GOODS_PRICE[!is.na(app_train3$AMT_GOODS_PRICE)]
no_missing_test <- app_test3$AMT_GOODS_PRICE[!is.na(app_test3$AMT_GOODS_PRICE)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$AMT_GOODS_PRICE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$AMT_GOODS_PRICE)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$AMT_GOODS_PRICE[is.na(app_train3$AMT_GOODS_PRICE)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$AMT_GOODS_PRICE[is.na(app_test3$AMT_GOODS_PRICE)] <- random_values_test
# Plot the distribution of AMT_GOODS_PRICE
hist(app_test3$AMT_GOODS_PRICE,
main = "Distribution of Goods Price in Test Sample",
xlab = "Goods Price")
hist(app_train3$AMT_GOODS_PRICE,
main = "Distribution of Goods Price in Train Sample",
xlab = "Goods Price")
# Boxplots
boxplot(app_test3$AMT_GOODS_PRICE,
main = "Boxplot of Goods Price in Test Sample",
ylab = "Goods Price")
boxplot(app_train3$AMT_GOODS_PRICE,
main = "Boxplot of Goods Price in Train Sample",
ylab = "Goods Price")
Apply the logarithmic transformation.
app_test3$AMT_GOODS_PRICE <- log(app_test3$AMT_GOODS_PRICE)
app_train3$AMT_GOODS_PRICE <- log(app_train3$AMT_GOODS_PRICE)
# Plot the distribution of AMT_GOODS_PRICE
hist(app_test3$AMT_GOODS_PRICE,
main = "Distribution of Goods Price in Test Sample after transformation",
xlab = "Goods Price")
hist(app_train3$AMT_GOODS_PRICE,
main = "Distribution of Goods Price in Train Sample after transformation",
xlab = "Goods Price")
# Summary statistics of NAME_TYPE_SUITE
summary(app_test3$NAME_TYPE_SUITE)
## Length Class Mode
## 48744 character character
summary(app_train3$NAME_TYPE_SUITE)
## Length Class Mode
## 307511 character character
# We can observe the NAME_TYPE_SUITE is a text variable. We need to transform it to categorical data type.
unique(app_train3$NAME_TYPE_SUITE)
## [1] "Unaccompanied" "Family" "Spouse, partner" "Children"
## [5] "Other_A" NA "Other_B" "Group of people"
unique(app_test3$NAME_TYPE_SUITE)
## [1] "Unaccompanied" NA "Family" "Spouse, partner"
## [5] "Group of people" "Other_B" "Children" "Other_A"
# Integer code assigned to every known NAME_TYPE_SUITE category
category_mapping <- c(
  "Unaccompanied" = 1, "Family" = 2, "Spouse, partner" = 3,
  "Group of people" = 4, "Other_B" = 5, "Children" = 6, "Other_A" = 7
)
# Look each label up by name so the codes come from the mapping values
# themselves rather than from the order in which the names are listed.
# Labels absent from the mapping, and NA, become NA.
app_train3$NAME_TYPE_SUITE <- as.integer(
  unname(category_mapping[app_train3$NAME_TYPE_SUITE])
)
app_test3$NAME_TYPE_SUITE <- as.integer(
  unname(category_mapping[app_test3$NAME_TYPE_SUITE])
)
# Now we can verify the changes
unique(app_test3$NAME_TYPE_SUITE)
## [1] 1 NA 2 3 4 5 6 7
# and we see that we need to handle missing data
# Impute missing NAME_TYPE_SUITE codes by resampling: every NA is replaced by
# a code drawn with replacement from the observed codes of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
no_missing_train <- app_train3$NAME_TYPE_SUITE[!is.na(app_train3$NAME_TYPE_SUITE)]
no_missing_test <- app_test3$NAME_TYPE_SUITE[!is.na(app_test3$NAME_TYPE_SUITE)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_TYPE_SUITE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_TYPE_SUITE)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$NAME_TYPE_SUITE[is.na(app_train3$NAME_TYPE_SUITE)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$NAME_TYPE_SUITE[is.na(app_test3$NAME_TYPE_SUITE)] <- random_values_test
# Plot the distribution of NAME_TYPE_SUITE
hist(app_test3$NAME_TYPE_SUITE,
main = "Distribution of type suite in test sample",
xlab = "Type suite")
hist(app_train3$NAME_TYPE_SUITE,
main = "Distribution of type suite in train sample",
xlab = "Type suite")
After checking the distribution, we decide to parse the NAME_TYPE_SUITE as binary variable.
# Collapse the type-suite codes to a flag: 0 for code 1, 1 for every other
# code. The codes are integers, so compare against a number, not the string '1'.
app_test3$NAME_TYPE_SUITE <- ifelse(app_test3$NAME_TYPE_SUITE == 1L, 0, 1)
app_train3$NAME_TYPE_SUITE <- ifelse(app_train3$NAME_TYPE_SUITE == 1L, 0, 1)
hist(app_test3$NAME_TYPE_SUITE,
main = "Distribution of type suite in test sample",
xlab = "Type suite")
hist(app_train3$NAME_TYPE_SUITE,
main = "Distribution of type suite in train sample",
xlab = "Type suite")
# Summary statistics of NAME_EDUCATION_TYPE
summary(app_test3$NAME_EDUCATION_TYPE)
## Length Class Mode
## 48744 character character
summary(app_train3$NAME_EDUCATION_TYPE)
## Length Class Mode
## 307511 character character
# We can observe the NAME_EDUCATION_TYPE is a text variable. We need to transform it to categorical data type.
unique(app_train3$NAME_EDUCATION_TYPE)
## [1] "Secondary / secondary special" "Higher education"
## [3] "Incomplete higher" "Lower secondary"
## [5] "Academic degree"
unique(app_test3$NAME_EDUCATION_TYPE)
## [1] "Higher education" "Secondary / secondary special"
## [3] "Incomplete higher" "Lower secondary"
## [5] "Academic degree"
# Integer code assigned to every NAME_EDUCATION_TYPE category, lowest to highest
category_mapping <- c(
  "Lower secondary" = 1, "Secondary / secondary special" = 2,
  "Incomplete higher" = 3, "Higher education" = 4, "Academic degree" = 5
)
# Look each label up by name so the codes come from the mapping values
# themselves rather than from the order in which the names are listed.
# Labels absent from the mapping, and NA, become NA.
app_train3$NAME_EDUCATION_TYPE <- as.integer(
  unname(category_mapping[app_train3$NAME_EDUCATION_TYPE])
)
app_test3$NAME_EDUCATION_TYPE <- as.integer(
  unname(category_mapping[app_test3$NAME_EDUCATION_TYPE])
)
# Now we can verify the changes
unique(app_test3$NAME_EDUCATION_TYPE)
## [1] 4 2 3 1 5
# No NA appears here: every education label is covered by the mapping, so the imputation below leaves the data unchanged
# Impute missing NAME_EDUCATION_TYPE codes by resampling: every NA is replaced
# by a code drawn with replacement from the observed codes of the same sample
# (train from train, test from test). No seed is set in this step, so any
# imputed values change from run to run.
no_missing_train <- app_train3$NAME_EDUCATION_TYPE[!is.na(app_train3$NAME_EDUCATION_TYPE)]
no_missing_test <- app_test3$NAME_EDUCATION_TYPE[!is.na(app_test3$NAME_EDUCATION_TYPE)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_EDUCATION_TYPE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_EDUCATION_TYPE)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$NAME_EDUCATION_TYPE[is.na(app_train3$NAME_EDUCATION_TYPE)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$NAME_EDUCATION_TYPE[is.na(app_test3$NAME_EDUCATION_TYPE)] <- random_values_test
# Plot the distribution of NAME_EDUCATION_TYPE
hist(app_test3$NAME_EDUCATION_TYPE,
main = "Distribution of education type in test sample",
xlab = "education type")
hist(app_train3$NAME_EDUCATION_TYPE,
main = "Distribution of education type in train sample",
xlab = "education type")
After checking the distribution, we decide to parse the NAME_EDUCATION_TYPE as binary variable. Incomplete higher education and above is 1 and anything lower is 0.
# Collapse the education codes to a flag: codes 1 and 2 become 0, every higher
# code becomes 1. The codes are positive integers, so one numeric comparison
# replaces the nested string comparisons; NA stays NA.
app_test3$NAME_EDUCATION_TYPE <- ifelse(app_test3$NAME_EDUCATION_TYPE <= 2L, 0, 1)
app_train3$NAME_EDUCATION_TYPE <- ifelse(app_train3$NAME_EDUCATION_TYPE <= 2L, 0, 1)
hist(app_test3$NAME_EDUCATION_TYPE,
main = "Distribution of education in test sample",
xlab = "Educ")
hist(app_train3$NAME_EDUCATION_TYPE,
main = "Distribution of education in train sample",
xlab = "Educ")
# Summary statistics of NAME_FAMILY_STATUS
summary(app_test3$NAME_FAMILY_STATUS)
## Length Class Mode
## 48744 character character
summary(app_train3$NAME_FAMILY_STATUS)
## Length Class Mode
## 307511 character character
# We can observe the NAME_FAMILY_STATUS is a text variable. We need to transform it to categorical data type.
unique(app_train3$NAME_FAMILY_STATUS)
## [1] "Single / not married" "Married" "Civil marriage"
## [4] "Widow" "Separated" "Unknown"
unique(app_test3$NAME_FAMILY_STATUS)
## [1] "Married" "Single / not married" "Civil marriage"
## [4] "Widow" "Separated"
# Integer code assigned to every mapped NAME_FAMILY_STATUS category.
# "Unknown" (seen in the train sample only) is not listed, so it becomes NA.
category_mapping <- c(
  "Single / not married" = 1, "Civil marriage" = 2, "Married" = 3,
  "Separated" = 4, "Widow" = 5
)
# Look each label up by name so the codes come from the mapping values
# themselves rather than from the order in which the names are listed.
# Labels absent from the mapping, and NA, become NA.
app_train3$NAME_FAMILY_STATUS <- as.integer(
  unname(category_mapping[app_train3$NAME_FAMILY_STATUS])
)
app_test3$NAME_FAMILY_STATUS <- as.integer(
  unname(category_mapping[app_test3$NAME_FAMILY_STATUS])
)
# Now we can verify the changes
unique(app_test3$NAME_FAMILY_STATUS)
## [1] 3 1 2 5 4
# The "Unknown" status in the train sample is not in the mapping, so it becomes NA and we need to handle missing data
# Impute missing NAME_FAMILY_STATUS codes by resampling: every NA is replaced
# by a code drawn with replacement from the observed codes of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
no_missing_train <- app_train3$NAME_FAMILY_STATUS[!is.na(app_train3$NAME_FAMILY_STATUS)]
no_missing_test <- app_test3$NAME_FAMILY_STATUS[!is.na(app_test3$NAME_FAMILY_STATUS)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_FAMILY_STATUS)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_FAMILY_STATUS)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$NAME_FAMILY_STATUS[is.na(app_train3$NAME_FAMILY_STATUS)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$NAME_FAMILY_STATUS[is.na(app_test3$NAME_FAMILY_STATUS)] <- random_values_test
# Plot the distribution of NAME_FAMILY_STATUS
hist(app_test3$NAME_FAMILY_STATUS,
main = "Distribution of family status in test sample",
xlab = "family status")
hist(app_train3$NAME_FAMILY_STATUS,
main = "Distribution of family status in train sample",
xlab = "family status")
# Summary statistics of DAYS_EMPLOYED
summary(app_test3$DAYS_EMPLOYED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17463 -2910 -1293 67485 -296 365243
summary(app_train3$DAYS_EMPLOYED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17912 -2760 -1213 63815 -289 365243
sum(is.na(app_test3$DAYS_EMPLOYED))
## [1] 0
sum(is.na(app_train3$DAYS_EMPLOYED))
## [1] 0
# Plot the distribution of DAYS_EMPLOYED
hist(app_test3$DAYS_EMPLOYED,
main = "Distribution of employed days in test sample",
xlab = "employed days")
hist(app_train3$DAYS_EMPLOYED,
main = "Distribution of employed days in train sample",
xlab = "employed days")
# Summary statistics of REGION_RATING_CLIENT
summary(app_test3$REGION_RATING_CLIENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.038 2.000 3.000
summary(app_train3$REGION_RATING_CLIENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.052 2.000 3.000
sum(is.na(app_test3$REGION_RATING_CLIENT))
## [1] 0
sum(is.na(app_train3$REGION_RATING_CLIENT))
## [1] 0
# Plot the distribution of REGION_RATING_CLIENT
hist(app_test3$REGION_RATING_CLIENT,
main = "Distribution of region rating client in test sample",
xlab = "region rating client")
hist(app_train3$REGION_RATING_CLIENT,
main = "Distribution of region rating client in train sample",
xlab = "region rating client")
# Summary statistics of DOCUMENT
summary(app_test3$DOCUMENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05000 0.05000 0.04923 0.05000 0.05000
summary(app_train3$DOCUMENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05000 0.05000 0.04651 0.05000 0.20000
sum(is.na(app_test3$DOCUMENT))
## [1] 0
sum(is.na(app_train3$DOCUMENT))
## [1] 0
# Reduce DOCUMENT to a 0/1 flag: 1 where the value is positive, 0 otherwise.
app_test3$DOCUMENT <- as.numeric(app_test3$DOCUMENT > 0)
app_train3$DOCUMENT <- as.numeric(app_train3$DOCUMENT > 0)
# Plot the distribution of DOCUMENT
hist(app_test3$DOCUMENT,
main = "Distribution of DOCUMENT in test sample",
xlab = "DOCUMENT")
hist(app_train3$DOCUMENT,
main = "Distribution of DOCUMENT in train sample",
xlab = "DOCUMENT")
# Summary statistics of EXT_SOURCE_1
summary(app_test3$EXT_SOURCE_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.013 0.344 0.507 0.501 0.666 0.939 20532
summary(app_train3$EXT_SOURCE_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01 0.33 0.51 0.50 0.68 0.96 173378
sum(is.na(app_test3$EXT_SOURCE_1))
## [1] 20532
sum(is.na(app_train3$EXT_SOURCE_1))
## [1] 173378
# Impute missing EXT_SOURCE_1 values by resampling: every NA is replaced by a
# value drawn with replacement from the observed values of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
# NOTE(review): 173378 of the 307511 train values are missing, so imputed
# values outnumber observed ones - confirm this column should be kept.
no_missing_train <- app_train3$EXT_SOURCE_1[!is.na(app_train3$EXT_SOURCE_1)]
no_missing_test <- app_test3$EXT_SOURCE_1[!is.na(app_test3$EXT_SOURCE_1)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_1)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_1)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$EXT_SOURCE_1[is.na(app_train3$EXT_SOURCE_1)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$EXT_SOURCE_1[is.na(app_test3$EXT_SOURCE_1)] <- random_values_test
# Plot the distribution of EXT_SOURCE_1
hist(app_test3$EXT_SOURCE_1,
main = "Distribution of EXT_SOURCE_1 in test sample",
xlab = "EXT_SOURCE_1")
hist(app_train3$EXT_SOURCE_1,
main = "Distribution of EXT_SOURCE_1 in train sample",
xlab = "EXT_SOURCE_1")
# Summary statistics of EXT_SOURCE_2
summary(app_test3$EXT_SOURCE_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000008 0.408066 0.558758 0.518021 0.658497 0.855000 8
summary(app_train3$EXT_SOURCE_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.3925 0.5660 0.5144 0.6636 0.8550 660
sum(is.na(app_test3$EXT_SOURCE_2))
## [1] 8
sum(is.na(app_train3$EXT_SOURCE_2))
## [1] 660
# Impute missing EXT_SOURCE_2 values by resampling: every NA is replaced by a
# value drawn with replacement from the observed values of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
no_missing_train <- app_train3$EXT_SOURCE_2[!is.na(app_train3$EXT_SOURCE_2)]
no_missing_test <- app_test3$EXT_SOURCE_2[!is.na(app_test3$EXT_SOURCE_2)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_2)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_2)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$EXT_SOURCE_2[is.na(app_train3$EXT_SOURCE_2)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$EXT_SOURCE_2[is.na(app_test3$EXT_SOURCE_2)] <- random_values_test
# Plot the distribution of EXT_SOURCE_2
hist(app_test3$EXT_SOURCE_2,
main = "Distribution of EXT_SOURCE_2 in test sample",
xlab = "EXT_SOURCE_2")
hist(app_train3$EXT_SOURCE_2,
main = "Distribution of EXT_SOURCE_2 in train sample",
xlab = "EXT_SOURCE_2")
# Summary statistics of EXT_SOURCE_3
summary(app_test3$EXT_SOURCE_3)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.001 0.364 0.519 0.500 0.653 0.883 8668
summary(app_train3$EXT_SOURCE_3)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.37 0.54 0.51 0.67 0.90 60965
sum(is.na(app_test3$EXT_SOURCE_3))
## [1] 8668
sum(is.na(app_train3$EXT_SOURCE_3))
## [1] 60965
# Impute missing EXT_SOURCE_3 values by resampling: every NA is replaced by a
# value drawn with replacement from the observed values of the same sample
# (train from train, test from test). No seed is set in this step, so the
# imputed values change from run to run.
no_missing_train <- app_train3$EXT_SOURCE_3[!is.na(app_train3$EXT_SOURCE_3)]
no_missing_test <- app_test3$EXT_SOURCE_3[!is.na(app_test3$EXT_SOURCE_3)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_3)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_3)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$EXT_SOURCE_3[is.na(app_train3$EXT_SOURCE_3)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$EXT_SOURCE_3[is.na(app_test3$EXT_SOURCE_3)] <- random_values_test
# Plot the distribution of EXT_SOURCE_3
hist(app_test3$EXT_SOURCE_3,
main = "Distribution of EXT_SOURCE_3 in test sample",
xlab = "EXT_SOURCE_3")
hist(app_train3$EXT_SOURCE_3,
main = "Distribution of EXT_SOURCE_3 in train sample",
xlab = "EXT_SOURCE_3")
# Summary statistics of AMT_REQ_CREDIT_BUREAU_QRT
summary(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.547 1.000 7.000 6049
summary(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 0.27 0.00 261.00 41519
sum(is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT))
## [1] 6049
sum(is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT))
## [1] 41519
# Impute missing AMT_REQ_CREDIT_BUREAU_QRT values by resampling: every NA is
# replaced by a value drawn with replacement from the observed values of the
# same sample (train from train, test from test). No seed is set in this step,
# so the imputed values change from run to run.
no_missing_train <- app_train3$AMT_REQ_CREDIT_BUREAU_QRT[!is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)]
no_missing_test <- app_test3$AMT_REQ_CREDIT_BUREAU_QRT[!is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)]
# Draw positions with sample.int() and index the observed values, because
# sample(x, ...) reinterprets a single number x as the range 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)),
  replace = TRUE
)]
# Write the draws into the missing positions of the train dataset
app_train3$AMT_REQ_CREDIT_BUREAU_QRT[is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)] <- random_values_train
# Write the draws into the missing positions of the test dataset
app_test3$AMT_REQ_CREDIT_BUREAU_QRT[is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)] <- random_values_test
# Plot the distribution of AMT_REQ_CREDIT_BUREAU_QRT
hist(app_test3$AMT_REQ_CREDIT_BUREAU_QRT,
main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in test sample",
xlab = "AMT_REQ_CREDIT_BUREAU_QRT")
hist(app_train3$AMT_REQ_CREDIT_BUREAU_QRT,
main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in train sample",
xlab = "AMT_REQ_CREDIT_BUREAU_QRT")
Change to a binary variable: 1 if there was at least one request, 0 otherwise.
# Reduce the request count to a 0/1 flag: 1 where the count is positive,
# 0 otherwise.
app_test3$AMT_REQ_CREDIT_BUREAU_QRT <- as.numeric(
  app_test3$AMT_REQ_CREDIT_BUREAU_QRT > 0
)
app_train3$AMT_REQ_CREDIT_BUREAU_QRT <- as.numeric(
  app_train3$AMT_REQ_CREDIT_BUREAU_QRT > 0
)
hist(app_test3$AMT_REQ_CREDIT_BUREAU_QRT,
main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in test sample",
xlab = "AMT_REQ_CREDIT_BUREAU_QRT")
hist(app_train3$AMT_REQ_CREDIT_BUREAU_QRT,
main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in train sample",
xlab = "AMT_REQ_CREDIT_BUREAU_QRT")
str(app_train3)
## 'data.frame': 307511 obs. of 21 variables:
## $ SK_ID_CURR : num 1e+05 1e+05 1e+05 1e+05 1e+05 ...
## $ TARGET : num 1 0 0 0 0 0 0 0 0 0 ...
## $ NAME_CONTRACT_TYPE : int 0 0 1 0 0 0 0 0 0 1 ...
## $ CODE_GENDER : num 0 1 0 1 0 0 1 0 1 0 ...
## $ FLAG_OWN_CAR : num 0 0 1 0 0 0 1 1 0 0 ...
## $ FLAG_OWN_REALTY : num 1 0 1 1 1 1 1 1 1 1 ...
## $ CNT_CHILDREN : num 0 0 0 0 0 0 1 0 0 0 ...
## $ AMT_INCOME_TOTAL : num 202500 270000 67500 135000 121500 ...
## $ AMT_CREDIT : num 12.9 14.1 11.8 12.7 13.1 ...
## $ AMT_ANNUITY : num 10.11 10.48 8.82 10.3 9.99 ...
## $ AMT_GOODS_PRICE : num 12.8 13.9 11.8 12.6 13.1 ...
## $ NAME_TYPE_SUITE : num 0 1 0 0 0 1 0 0 1 0 ...
## $ NAME_EDUCATION_TYPE : num 0 1 0 0 0 0 1 1 0 0 ...
## $ NAME_FAMILY_STATUS : int 1 3 1 2 1 3 3 3 3 1 ...
## $ DAYS_EMPLOYED : num -637 -1188 -225 -3039 -3038 ...
## $ REGION_RATING_CLIENT : num 2 1 2 2 2 2 2 3 2 2 ...
## $ DOCUMENT : num 1 1 0 1 1 1 1 1 1 0 ...
## $ EXT_SOURCE_1 : num 0.083 0.311 0.659 0.34 0.774 ...
## $ EXT_SOURCE_2 : num 0.263 0.622 0.556 0.65 0.323 ...
## $ EXT_SOURCE_3 : num 0.139 0.202 0.73 0.774 0.569 ...
## $ AMT_REQ_CREDIT_BUREAU_QRT: num 0 0 0 0 0 1 1 0 0 0 ...
# Load the corrplot package
library(corrplot)
# Calculate the pairwise correlation matrix of all columns of the train sample
cor_matrix <- cor(app_train3)
# Create heatmap of the correlations.
# heatmap() has no `cmap` argument; the palette is passed as `col`, a vector
# of colours, so the ramp function is evaluated for 100 shades.
heatmap(cor_matrix,
  col = colorRampPalette(c("blue", "white", "red"))(100),
  # A correlation matrix is symmetric and already on a common scale, so the
  # default row standardisation is switched off.
  symm = TRUE,
  scale = "none",
  # Fix the colour range to [-1, 1] so that white corresponds to zero.
  zlim = c(-1, 1),
  main = "Correlation Matrix"
)
## Warning in plot.window(...): "cmap" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "cmap" is not a graphical parameter
## Warning in title(...): "cmap" is not a graphical parameter
app_train4<-app_train3
app_test4<-app_test3
#Karolina
# file_path <- "C:\\Users\\karla\\OneDrive\\Pulpit\\Credit Risk\\Projekt\\app_train4.csv"
# Save the dataframe as a CSV file
# write_csv(app_train4, file_path)
# file_path <- "C:\\Users\\karla\\OneDrive\\Pulpit\\Credit Risk\\Projekt\\app_test4.csv"
# write_csv(app_test4, file_path)
save(app_train4, file = "dane/app_train4.RData")
save(app_test4, file = "dane/app_test4.RData")